Plotly chart for number of UFO sightings per year for all shapes

Author
Affiliation

Georgetown University

Published

April 18, 2023

1 Importing the libraries

Code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'notebook+plotly_mimetype'


import warnings
warnings.filterwarnings('ignore')

2 Read the csv file

Code
# Load the UFO sightings CSV into a data-frame
# (the path is relative to the current working directory)
df = pd.read_csv('../data/ufo_data_nuforc.csv')
# Bare expression: shows the loaded data-frame as the cell output
df
posted date time city state shape duration summary images
0 12/22/22 12/22/22 09:29:00 Newark DE Light 30 secomds I saw a light that was not flashing moving the... NaN
1 12/22/22 12/21/22 21:33:00 Columbus OH Light 1-2 minutes 4-5 orange balls of light in a straight line NaN
2 12/22/22 12/21/22 04:11:00 Franklin ME Light 2 hours Hovering bright light with satellite lights ex... NaN
3 12/22/22 12/20/22 23:30:00 East Greenwich RI Light 3 seconds maybe even less I was driving the interstate at night with my ... NaN
4 12/22/22 12/20/22 21:58:00 Mentor Headlands OH Cylinder A few minutes Looked cylindrical from the lights, changing c... NaN
... ... ... ... ... ... ... ... ... ...
117824 03/07/98 07/01/79 03:00:00 Chico CA Rectangle 4 - 7 minutes A huge solid black mass, silently glided direc... NaN
117825 03/07/98 12/20/78 17:00:00 Huntington Park CA Disk 20 min. My Father and I watched a silver object, shape... NaN
117826 03/07/98 01/01/75 03:00:00 Jamestown NY Sphere a few minutes I was 13-15. I was in my parent house on Moon ... NaN
117827 03/07/98 08/01/73 04:00:00 Kittery ME Formation 10 Seconds A VERY fast light point object that was viewed... NaN
117828 03/07/98 07/24/73 02:24:00 Blackfoot ID Triangle Approx. 4 minutes Smooth, rounded triangular object, stone grey,... NaN

117829 rows × 9 columns

3 Data pre-processing

Code
# Parse the 'date' strings (month/day/two-digit year) into datetimes; values
# that do not match the format become NaT instead of raising.
# Then derive the four-digit year as a string column and order the rows
# chronologically, oldest first.
df = (
    df.assign(
        date=lambda frame: pd.to_datetime(
            frame['date'], format='%m/%d/%y', errors='coerce'
        )
    )
    .assign(year=lambda frame: frame['date'].dt.strftime('%Y'))
    .sort_values(by='date', ascending=True)
)
df
posted date time city state shape duration summary images year
113584 12/16/99 1973-01-01 23:30:00 Chattanooga TN Oval 30 sec. 3 objects in a tri-angle formation also in sta... NaN 1973
90658 03/11/06 1973-01-08 21:00:00 Muscle Shoals AL Oval 20 seconds Glowing orange detailess object with outline s... NaN 1973
104522 11/20/02 1973-01-12 03:00:00 Farmington (SE of, deserted area, Hwy 44) NM Light aprox:2-3 min A large bright light apeared seemingly from no... NaN 1973
82505 02/14/08 1973-01-14 19:00:00 Scranton PA Rectangle 10 seconds Rectangular object moving at a very high rate ... NaN 1973
113583 12/16/99 1973-01-28 23:38:00 Glendora CA Disk 5-10 mins An illuminated Saucer hovers over my back pati... NaN 1973
... ... ... ... ... ... ... ... ... ... ...
16879 08/23/19 2023-01-11 11:15:00 Belleview NE Diamond 8 seconds Terrified NaN 2023
60421 06/08/12 2023-01-11 22:00:00 Stratford CT Unknown 8 to 10 Observed 2 large white lights that appeared to... NaN 2023
114478 10/02/99 2023-01-11 23:00:00 Northern California Coast (in, highway) CA Cigar 3-5 min. We were travelling along the coast of Californ... NaN 2023
113632 11/30/99 2023-01-11 17:30:00 New Martinsville WV Light 5 minutes At 5:30pm EST, I was pulling from a side stree... NaN 2023
7158 03/02/21 2023-01-11 12:00:00 Harding NJ Circle 1 hour Round small yellow saucer idled in/above the w... NaN 2023

117829 rows × 10 columns

4 Taking care of missing data

Code
# Define a function that returns a data-frame of missing data statistics
def missing_val_stats(df):
    """
    Summarise unique and missing values for every column of a data-frame.

    Parameters
    ----------
    df : pandas.DataFrame
        The data-frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, in column order, with the fields:

        - ``column``: the column label
        - ``unique_val``: array of the distinct values (NaN included)
        - ``num_unique_val``: number of distinct values, counting NaN
        - ``num_unique_val_nona``: number of distinct values, ignoring NaN
        - ``num_miss``: number of missing values
        - ``pct_miss``: share of missing values in percent

        Every row carries the index label 0. If ``df`` has no columns an
        empty data-frame with the fields above is returned.
    """
    stat_columns = ['column', 'unique_val', 'num_unique_val',
                    'num_unique_val_nona', 'num_miss', 'pct_miss']
    n_rows = len(df)
    per_column = []

    for c in df.columns:
        # Compute the distinct values once and reuse them for the count
        uniques = df[c].unique()
        num_miss = df[c].isnull().sum()
        # Avoid a 0/0 division when the data-frame has no rows
        pct_miss = (num_miss / n_rows).round(3) * 100 if n_rows else 0.0

        # One-row frame per column; wrapping the array in a list keeps it
        # in a single cell instead of spreading it over several rows
        per_column.append(pd.DataFrame({
            'column': [c],
            'unique_val': [uniques],
            'num_unique_val': [len(uniques)],
            'num_unique_val_nona': [int(df[c].nunique())],
            'num_miss': [num_miss],
            'pct_miss': [pct_miss],
        }, columns=stat_columns))

    if not per_column:
        return pd.DataFrame(columns=stat_columns)

    # DataFrame.append no longer exists in pandas 2.x; concatenate once
    # at the end instead of growing the result inside the loop
    return pd.concat(per_column)
Code
# Compute the per-column unique/missing value statistics for df
ufo_missing_stats = missing_val_stats(df)
# Bare expression: shows the statistics table as the cell output
ufo_missing_stats
column unique_val num_unique_val num_unique_val_nona num_miss pct_miss
0 posted [12/16/99, 03/11/06, 11/20/02, 02/14/08, 10/31... 615 615 0 0.0
0 date [1973-01-01T00:00:00.000000000, 1973-01-08T00:... 12440 12440 0 0.0
0 time [23:30:00, 21:00:00, 03:00:00, 19:00:00, 23:38... 1439 1439 0 0.0
0 city [Chattanooga, Muscle Shoals, Farmington (SE of... 20899 20899 0 0.0
0 state [TN, AL, NM, PA, CA, HI, LA, VA, TX, FL, NC, U... 53 53 0 0.0
0 shape [Oval, Light, Rectangle, Disk, Sphere, Circle,... 39 39 0 0.0
0 duration [30 sec., 20 seconds, aprox:2-3 min, 10 second... 11993 11992 3088 2.6
0 summary [3 objects in a tri-angle formation also in st... 117174 117174 0 0.0
0 images [nan, Yes] 2 1 116504 98.9
0 year [1973, 1974, 1975, 1976, 1977, 1978, 1979, 198... 51 51 0 0.0

The images column is a limitation: it is missing for 98.9% of the entries, and it is not clear whether the data is correct for the entries whose image value is NaN.

Code
# Remove the 'posted' and 'images' columns, which are treated as redundant;
# the data-frame is modified in place
df.drop(labels=['posted', 'images'], axis='columns', inplace=True)
df
date time city state shape duration summary year
113584 1973-01-01 23:30:00 Chattanooga TN Oval 30 sec. 3 objects in a tri-angle formation also in sta... 1973
90658 1973-01-08 21:00:00 Muscle Shoals AL Oval 20 seconds Glowing orange detailess object with outline s... 1973
104522 1973-01-12 03:00:00 Farmington (SE of, deserted area, Hwy 44) NM Light aprox:2-3 min A large bright light apeared seemingly from no... 1973
82505 1973-01-14 19:00:00 Scranton PA Rectangle 10 seconds Rectangular object moving at a very high rate ... 1973
113583 1973-01-28 23:38:00 Glendora CA Disk 5-10 mins An illuminated Saucer hovers over my back pati... 1973
... ... ... ... ... ... ... ... ...
16879 2023-01-11 11:15:00 Belleview NE Diamond 8 seconds Terrified 2023
60421 2023-01-11 22:00:00 Stratford CT Unknown 8 to 10 Observed 2 large white lights that appeared to... 2023
114478 2023-01-11 23:00:00 Northern California Coast (in, highway) CA Cigar 3-5 min. We were travelling along the coast of Californ... 2023
113632 2023-01-11 17:30:00 New Martinsville WV Light 5 minutes At 5:30pm EST, I was pulling from a side stree... 2023
7158 2023-01-11 12:00:00 Harding NJ Circle 1 hour Round small yellow saucer idled in/above the w... 2023

117829 rows × 8 columns

Code
# Define a custom function to map dates to seasons
def get_season(date):
    """
    Return the name of the season a single date falls in.

    The season boundaries are fixed calendar days, identical every year:
    Spring starts on 20 March, Summer on 20 June, Fall on 22 September and
    Winter on 21 December.

    Parameters
    ----------
    date : pandas.Timestamp
        The date to classify; only its ``month`` and ``day`` are used.

    Returns
    -------
    str or None
        "Winter", "Spring", "Summer" or "Fall"; None when ``date`` is
        missing (NaT/NaN).
    """
    # A missing date has no season; without this guard NaT would raise
    if pd.isna(date):
        return None

    # (month, day) tuples compare in calendar order, so no per-year
    # boundary timestamps have to be built
    month_day = (date.month, date.day)

    # Winter wraps around the new year: 21 December up to 19 March
    if month_day < (3, 20) or month_day >= (12, 21):
        return "Winter"
    if month_day < (6, 20):
        return "Spring"
    if month_day < (9, 22):
        return "Summer"
    return "Fall"
Code
# Build the "season" column by passing every value of the "date" column
# through get_season
df["season"] = df["date"].map(get_season)
df
date time city state shape duration summary year season
113584 1973-01-01 23:30:00 Chattanooga TN Oval 30 sec. 3 objects in a tri-angle formation also in sta... 1973 Winter
90658 1973-01-08 21:00:00 Muscle Shoals AL Oval 20 seconds Glowing orange detailess object with outline s... 1973 Winter
104522 1973-01-12 03:00:00 Farmington (SE of, deserted area, Hwy 44) NM Light aprox:2-3 min A large bright light apeared seemingly from no... 1973 Winter
82505 1973-01-14 19:00:00 Scranton PA Rectangle 10 seconds Rectangular object moving at a very high rate ... 1973 Winter
113583 1973-01-28 23:38:00 Glendora CA Disk 5-10 mins An illuminated Saucer hovers over my back pati... 1973 Winter
... ... ... ... ... ... ... ... ... ...
16879 2023-01-11 11:15:00 Belleview NE Diamond 8 seconds Terrified 2023 Winter
60421 2023-01-11 22:00:00 Stratford CT Unknown 8 to 10 Observed 2 large white lights that appeared to... 2023 Winter
114478 2023-01-11 23:00:00 Northern California Coast (in, highway) CA Cigar 3-5 min. We were travelling along the coast of Californ... 2023 Winter
113632 2023-01-11 17:30:00 New Martinsville WV Light 5 minutes At 5:30pm EST, I was pulling from a side stree... 2023 Winter
7158 2023-01-11 12:00:00 Harding NJ Circle 1 hour Round small yellow saucer idled in/above the w... 2023 Winter

117829 rows × 9 columns

5 Generate the chart using Plotly

Before generating the plot, we have to group the dataframe so that we can count the total number of occurrences.

Since we are going to plot the number of occurrences per year for each shape, let's group the dataframe by the year and shape columns.

Code
# Count the number of sightings per (year, shape)
# The shape labels are normalised (case-folded, 'other' merged into
# 'unknown', then title-cased) BEFORE counting, so that labels which end up
# identical are counted together and each (year, shape) pair yields exactly
# one row
df_grouped = (
    df.assign(
        shape=lambda frame: frame['shape']
        .str.lower()
        .replace('other', 'unknown')
        .str.title()
    )
    .groupby(['year', 'shape'])
    .size()
    .reset_index(name='n_sightings')
    .sort_values(by='n_sightings', ascending=False)
)
# Leave out 2023 (presumably because that year is incomplete in the data);
# 'year' holds strings, hence the comparison against '2023'
df_grouped = df_grouped[df_grouped['year'] != '2023']
df_grouped
year shape n_sightings
883 2014 Light 1733
1011 2020 Light 1646
839 2012 Light 1538
861 2013 Light 1517
904 2015 Light 1458
... ... ... ...
117 1978 Teardrop 1
120 1978 Circle 1
282 1987 Cone 1
268 1986 Egg 1
790 2010 Delta 1

1064 rows × 3 columns

Code
# Generate the plot

# Number of shapes (ranked by total sightings) that get their own trace
TOP_N_SHAPES = 20
# Title shown initially and when the 'All' button is selected
ALL_SHAPES_TITLE = "Number of UFO sightings over the years for all shapes"

# Get the names of the shapes with the most sightings overall, most common first
ufo_shapes = (
    df_grouped.groupby(['shape'])
    .n_sightings.sum()
    .sort_values(ascending=False)
    .reset_index()['shape'][:TOP_N_SHAPES]
)

layout = go.Layout(
    yaxis=dict(
        title_standoff=200  # Adjust this value to change the distance between the axis and its label
    ),
    xaxis=dict(
        title_standoff=200,  # Adjust this value to change the distance between the axis and its label
        dtick=5,  # Set tick gap to 5 years
        tickangle=0  # Set tick angle to horizontal
    )
)

# INITIALIZE GRAPH OBJECT
fig = go.Figure(layout=layout)

# Add one line trace per shape
for shape in ufo_shapes:
    # ISOLATE ONE SHAPE OF DATA FOR PLOTTING
    # A boolean mask is used instead of a query string so that shape names
    # containing quotes cannot break the expression
    shape_rows = df_grouped[df_grouped['shape'] == shape]
    # Collapse to one point per year (sorted by year) so the line never
    # doubles back if several rows share the same (year, shape)
    df_shape = shape_rows.groupby('year', as_index=False)['n_sightings'].sum()

    # Add trace to the figure
    fig.add_trace(
        go.Scatter(
            x=df_shape['year'],
            y=df_shape['n_sightings'],
            mode='lines',
            name=shape,
            hovertemplate='Year: %{x}<br>Number of UFO sightings: %{y}'
        )
    )

n_traces = len(fig.data)

# First dropdown entry: show every trace
button_list = [
    dict(
        label='All',
        method="update",
        args=[{"visible": [True] * n_traces},
              {"title": ALL_SHAPES_TITLE}]  # layout attribute
    )
]

# One dropdown entry per trace: show only that trace and adapt the title
for i, trace in enumerate(fig.data):
    visibility = [False] * n_traces
    visibility[i] = True  # Only the i'th trace stays visible

    button_list.append(
        dict(
            label=trace.name,
            method="update",
            args=[{"visible": visibility},
                  {"title": f"Number of UFO sightings over the years for '{trace.name}' shape"}]  # layout attribute
        )
    )

# Set figure layout: dropdown menu, title and axis labels
fig.update_layout(
    updatemenus=[
        dict(
            active=0,  # 'All' is selected initially
            buttons=button_list,
            direction="down",
            pad={"b": 20, "t": 20},
            showactive=True,
            x=1.0,
            xanchor="left",
            y=1.25,
            yanchor="top"
        ),
    ],
    title=ALL_SHAPES_TITLE,
    xaxis_title="Year",
    yaxis_title="Number of sightings",
)

fig.show()